{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "aecb7fc7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting gensim\n",
      "  Downloading gensim-4.3.3-cp310-cp310-win_amd64.whl.metadata (8.2 kB)\n",
      "Collecting numpy<2.0,>=1.18.5 (from gensim)\n",
      "  Downloading numpy-1.26.4-cp310-cp310-win_amd64.whl.metadata (61 kB)\n",
      "Collecting scipy<1.14.0,>=1.7.0 (from gensim)\n",
      "  Downloading scipy-1.13.1-cp310-cp310-win_amd64.whl.metadata (60 kB)\n",
      "Collecting smart-open>=1.8.1 (from gensim)\n",
      "  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)\n",
      "Collecting wrapt (from smart-open>=1.8.1->gensim)\n",
      "  Downloading wrapt-1.17.2-cp310-cp310-win_amd64.whl.metadata (6.5 kB)\n",
      "Downloading gensim-4.3.3-cp310-cp310-win_amd64.whl (24.0 MB)\n",
      "   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--\n",
      "   - -------------------------------------- 0.8/24.0 MB 6.7 MB/s eta 0:00:04\n",
      "   ------------ --------------------------- 7.3/24.0 MB 22.6 MB/s eta 0:00:01\n",
      "   ----------------------------- ---------- 17.6/24.0 MB 33.5 MB/s eta 0:00:01\n",
      "   ---------------------------------------- 24.0/24.0 MB 35.3 MB/s eta 0:00:00\n",
      "Downloading numpy-1.26.4-cp310-cp310-win_amd64.whl (15.8 MB)\n",
      "   ---------------------------------------- 0.0/15.8 MB ? eta -:--:--\n",
      "   -------------------------- ------------- 10.5/15.8 MB 50.4 MB/s eta 0:00:01\n",
      "   ---------------------------------------- 15.8/15.8 MB 47.4 MB/s eta 0:00:00\n",
      "Downloading scipy-1.13.1-cp310-cp310-win_amd64.whl (46.2 MB)\n",
      "   ---------------------------------------- 0.0/46.2 MB ? eta -:--:--\n",
      "   ------- -------------------------------- 8.7/46.2 MB 41.3 MB/s eta 0:00:01\n",
      "   --------------- ------------------------ 18.4/46.2 MB 44.6 MB/s eta 0:00:01\n",
      "   ------------------------ --------------- 28.3/46.2 MB 46.1 MB/s eta 0:00:01\n",
      "   --------------------------------- ------ 38.5/46.2 MB 47.1 MB/s eta 0:00:01\n",
      "   ---------------------------------------  46.1/46.2 MB 47.4 MB/s eta 0:00:01\n",
      "   ---------------------------------------- 46.2/46.2 MB 44.6 MB/s eta 0:00:00\n",
      "Downloading smart_open-7.1.0-py3-none-any.whl (61 kB)\n",
      "Downloading wrapt-1.17.2-cp310-cp310-win_amd64.whl (38 kB)\n",
      "Installing collected packages: wrapt, numpy, smart-open, scipy, gensim\n",
      "  Attempting uninstall: numpy\n",
      "    Found existing installation: numpy 2.1.2\n",
      "    Uninstalling numpy-2.1.2:\n",
      "      Successfully uninstalled numpy-2.1.2\n",
      "Successfully installed gensim-4.3.3 numpy-1.26.4 scipy-1.13.1 smart-open-7.1.0 wrapt-1.17.2\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  WARNING: Failed to remove contents in a temporary directory 'D:\\Python310\\Lib\\site-packages\\~umpy.libs'.\n",
      "  You can safely remove it manually.\n",
      "  WARNING: Failed to remove contents in a temporary directory 'D:\\Python310\\Lib\\site-packages\\~umpy'.\n",
      "  You can safely remove it manually.\n"
     ]
    }
   ],
   "source": [
    "!pip install gensim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a9a7eb15",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting numpy==1.23.5\n",
      "  Downloading numpy-1.23.5-cp310-cp310-win_amd64.whl.metadata (2.3 kB)\n",
      "Downloading numpy-1.23.5-cp310-cp310-win_amd64.whl (14.6 MB)\n",
      "   ---------------------------------------- 0.0/14.6 MB ? eta -:--:--\n",
      "    --------------------------------------- 0.3/14.6 MB ? eta -:--:--\n",
      "   ----- ---------------------------------- 2.1/14.6 MB 9.0 MB/s eta 0:00:02\n",
      "   ------------------------------ --------- 11.3/14.6 MB 27.1 MB/s eta 0:00:01\n",
      "   ---------------------------------------- 14.6/14.6 MB 28.8 MB/s eta 0:00:00\n",
      "Installing collected packages: numpy\n",
      "  Attempting uninstall: numpy\n",
      "    Found existing installation: numpy 1.26.4\n",
      "    Uninstalling numpy-1.26.4:\n",
      "      Successfully uninstalled numpy-1.26.4\n",
      "Successfully installed numpy-1.23.5\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  WARNING: Failed to remove contents in a temporary directory 'D:\\Python310\\Lib\\site-packages\\~-mpy.libs'.\n",
      "  You can safely remove it manually.\n",
      "  WARNING: Failed to remove contents in a temporary directory 'D:\\Python310\\Lib\\site-packages\\~-mpy'.\n",
      "  You can safely remove it manually.\n"
     ]
    }
   ],
   "source": [
    "pip install numpy==1.23.5"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "67c1e6b1",
   "metadata": {},
   "source": [
    "\n",
    "\n",
    "下面的例子将展示词向量标准工具包——gensim提供的词嵌入，并展示词嵌入如何表示词的相似度。\n",
    "<!-- https://nlp.stanford.edu/projects/glove/ -->"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5c5a740a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pprint\n",
    "\n",
    "from gensim.models import KeyedVectors\n",
    "\n",
    "# 从GloVe官网下载GloVe向量，此处使用的是glove.6B.zip\n",
    "# 解压缩zip文件并将以下路径改为解压后对应文件的路径\n",
    "model = KeyedVectors.load_word2vec_format('D:/Desktop/社会舆情分析/glove.6B.100d.txt', binary=False, no_header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "01a2e4a5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('movie', 0.9055121541023254),\n",
      " ('films', 0.8914433717727661),\n",
      " ('directed', 0.8124362826347351),\n",
      " ('documentary', 0.8075793981552124),\n",
      " ('drama', 0.7929168939590454),\n",
      " ('movies', 0.7889865040779114),\n",
      " ('comedy', 0.7842751145362854),\n",
      " ('starring', 0.7573285102844238),\n",
      " ('cinema', 0.7419455647468567),\n",
      " ('hollywood', 0.7307389974594116)]\n",
      "[('vehicle', 0.8630837798118591),\n",
      " ('truck', 0.8597878813743591),\n",
      " ('cars', 0.837166965007782),\n",
      " ('driver', 0.8185911178588867),\n",
      " ('driving', 0.781263530254364),\n",
      " ('motorcycle', 0.7553156614303589),\n",
      " ('vehicles', 0.7462257146835327),\n",
      " ('parked', 0.74594646692276),\n",
      " ('bus', 0.737270712852478),\n",
      " ('taxi', 0.7155269384384155)]\n"
     ]
    }
   ],
   "source": [
    "# 使用most_similar()找到词表中距离给定词最近（最相似）的n个词\n",
    "pprint.pprint(model.most_similar('film'))\n",
    "pprint.pprint(model.most_similar('car'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8b62f7ad",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "japanese\n",
      "panda\n",
      "longest\n",
      "terrible\n",
      "queen\n"
     ]
    }
   ],
   "source": [
    "# 利用GloVe展示一个类比的例子\n",
    "def analogy(x1, x2, y1):\n",
    "    # 寻找top-N最相似的词。\n",
    "    result = model.most_similar(positive=[y1, x2], negative=[x1])\n",
    "    return result[0][0]\n",
    "\n",
    "print(analogy('china', 'chinese', 'japan'))\n",
    "print(analogy('australia', 'koala', 'china'))\n",
    "print(analogy('tall', 'tallest', 'long'))\n",
    "print(analogy('good', 'fantastic', 'bad'))\n",
    "print(analogy('man', 'woman', 'king'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c308cee",
   "metadata": {},
   "source": [
    "下面将展示word2vec的代码，包括文本预处理、skipgram算法的实现、以及使用PyTorch进行优化。这里使用《小王子》这本书作为训练语料。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "590fc408",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 安装NLTK，使用如下代码下载punkt组件\n",
    "#import nltk\n",
    "#nltk.download('punkt')\n",
    "\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "from collections import defaultdict\n",
    "\n",
    "# 使用类管理数据对象，包括文本读取、文本预处理等\n",
    "class TheLittlePrinceDataset:\n",
    "    def __init__(self, tokenize=True):\n",
    "        # 利用NLTK函数进行分句和分词\n",
    "        text = open('D:/Desktop/社会舆情分析/the little prince.txt', 'r', encoding='utf-8').read()\n",
    "        if tokenize:\n",
    "            self.sentences = sent_tokenize(text.lower())\n",
    "            self.tokens = [word_tokenize(sent) for sent in self.sentences]\n",
    "        else:\n",
    "            self.text = text\n",
    "\n",
    "    def build_vocab(self, min_freq=1):\n",
    "        # 统计词频\n",
    "        frequency = defaultdict(int)\n",
    "        for sentence in self.tokens:\n",
    "            for token in sentence:\n",
    "                frequency[token] += 1\n",
    "        self.frequency = frequency\n",
    "\n",
    "        # 加入<unk>处理未登录词，加入<pad>用于对齐变长输入进而加速\n",
    "        self.token2id = {'<unk>': 1, '<pad>': 0}\n",
    "        self.id2token = {1: '<unk>', 0: '<pad>'}\n",
    "        for token, freq in sorted(frequency.items(), key=lambda x: -x[1]):\n",
    "            # 丢弃低频词\n",
    "            if freq > min_freq:\n",
    "                self.token2id[token] = len(self.token2id)\n",
    "                self.id2token[len(self.id2token)] = token\n",
    "            else:\n",
    "                break\n",
    "\n",
    "    def get_word_distribution(self):\n",
    "        distribution = np.zeros(vocab_size)\n",
    "        for token, freq in self.frequency.items():\n",
    "            if token in dataset.token2id:\n",
    "                distribution[dataset.token2id[token]] = freq\n",
    "            else:\n",
    "                # 不在词表中的词按<unk>计算\n",
    "                distribution[1] += freq\n",
    "        distribution /= distribution.sum()\n",
    "        return distribution\n",
    "\n",
    "    # 将分词结果转化为索引表示\n",
    "    def convert_tokens_to_ids(self, drop_single_word=True):\n",
    "        self.token_ids = []\n",
    "        for sentence in self.tokens:\n",
    "            token_ids = [self.token2id.get(token, 1) for token in sentence]\n",
    "            # 忽略只有一个token的序列，无法计算loss\n",
    "            if len(token_ids) == 1 and drop_single_word:\n",
    "                continue\n",
    "            self.token_ids.append(token_ids)\n",
    "        \n",
    "        return self.token_ids\n",
    "\n",
    "dataset = TheLittlePrinceDataset()\n",
    "dataset.build_vocab(min_freq=1)\n",
    "sentences = dataset.convert_tokens_to_ids()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "efc882de",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(76044, 2) [[  4  16]\n",
      " [  4  19]\n",
      " [ 16   4]\n",
      " ...\n",
      " [130   3]\n",
      " [  3  86]\n",
      " [  3 130]]\n"
     ]
    }
   ],
   "source": [
    "# 遍历所有的中心词-上下文词对\n",
    "window_size = 2\n",
    "data = []\n",
    "\n",
    "for sentence in sentences:\n",
    "    for i in range(len(sentence)):\n",
    "        for j in range(i-window_size, i+window_size+1):\n",
    "            if j == i or j < 0 or j >= len(sentence):\n",
    "                continue\n",
    "            center_word = sentence[i]\n",
    "            context_word = sentence[j]\n",
    "            data.append([center_word, context_word])\n",
    "\n",
    "# 需要提前安装numpy\n",
    "import numpy as np\n",
    "data = np.array(data)\n",
    "print(data.shape, data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "30903b3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 需要提前安装PyTorch\n",
    "import torch\n",
    "from torch import nn\n",
    "import torch.nn.functional as F\n",
    "\n",
    "# 实现skipgram算法，使用对比学习计算损失\n",
    "class SkipGramNCE(nn.Module):\n",
    "    def __init__(self, vocab_size, embed_size, distribution,\\\n",
    "                 neg_samples=20):\n",
    "        super(SkipGramNCE, self).__init__()\n",
    "        print(f'vocab_size = {vocab_size}, embed_size = {embed_size}, '+\\\n",
    "              f'neg_samples = {neg_samples}')\n",
    "        self.input_embeddings = nn.Embedding(vocab_size, embed_size)\n",
    "        self.output_embeddings = nn.Embedding(vocab_size, embed_size)\n",
    "        distribution = np.power(distribution, 0.75)\n",
    "        distribution /= distribution.sum()\n",
    "        self.distribution = torch.tensor(distribution)\n",
    "        self.neg_samples = neg_samples\n",
    "        \n",
    "    def forward(self, input_ids, labels):\n",
    "        i_embed = self.input_embeddings(input_ids)\n",
    "        o_embed = self.output_embeddings(labels)\n",
    "        batch_size = i_embed.size(0)\n",
    "        n_words = torch.multinomial(self.distribution, batch_size * \\\n",
    "            self.neg_samples, replacement=True).view(batch_size, -1)\n",
    "        n_embed = self.output_embeddings(n_words)\n",
    "        pos_term = F.logsigmoid(torch.sum(i_embed * o_embed, dim=1))\n",
    "        # 负采样，用于对比学习\n",
    "        neg_term = F.logsigmoid(- torch.bmm(n_embed, \\\n",
    "            i_embed.unsqueeze(2)).squeeze())\n",
    "        neg_term = torch.sum(neg_term, dim=1)\n",
    "        loss = - torch.mean(pos_term + neg_term)\n",
    "        return loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "1d9da6c8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.00000000e+00 5.43983724e-02 5.34295679e-02 ... 9.68804495e-05\n",
      " 9.68804495e-05 9.68804495e-05]\n",
      "vocab_size = 1078, embed_size = 128, neg_samples = 20\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "epoch-99, loss=2.4168: 100%|█| 100/100 [03:24<00:00,  2.05s/\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjIAAAGwCAYAAACzXI8XAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjEsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvc2/+5QAAAAlwSFlzAAAPYQAAD2EBqD+naQAASLpJREFUeJzt3Xl0VPXdx/HPTCYzWSYL2VkSloCy74sBVFQqiEVUrIq04l4V3Kit+li1Hp8Wq33cEbtYaKsWpVYUd0RFUfZ9kzWQQBbWZLIvM/f5I2QgkkBIZuZmwvt1zpyT3Htn8s3VzHz4rRbDMAwBAAAEIavZBQAAADQVQQYAAAQtggwAAAhaBBkAABC0CDIAACBoEWQAAEDQIsgAAICgZTO7AH/zeDzKyclRVFSULBaL2eUAAIBGMAxDRUVFateunazWhttdWn2QycnJUWpqqtllAACAJsjOzlaHDh0aPN/qg0xUVJSkmhsRHR1tcjUAAKAxXC6XUlNTvZ/jDWn1Qaa2Oyk6OpogAwBAkDndsBAG+wIAgKBFkAEAAEGLIAMAAIIWQQYAAAQtggwAAAhaBBkAABC0CDIAACBoEWQAAEDQIsgAAICgRZABAABBiyADAACCFkEGAAAELYJMExVXVCv7SKmOllSaXQoAAGctgkwTPfH+Zp3/zFeauzLb7FIAADhrEWSaKCY8VJLkKq8yuRIAAM5eBJkmig63SZIKywgyAACYhSDTRN4WGYIMAACmIcg0UXRYTZChRQYAAPMQZJro+BiZapMrAQDg7EWQaaJoupYAADAdQaaJGCMDAID5CDJNdOKsJcMwTK4GAICzE0GmiWoH+1Z7DJVVuU2uBgCAsxNBpoki7CGyWS2SmLkEAIBZTA0yv/vd72SxWOo8unfv7j1fXl6uqVOnKj4+Xk6nUxMnTlR+fr6JFR9nsVhOGPDLzCUAAMxgeotMr169lJub630sWbLEe+6BBx7QggULNG/ePC1evFg5OTm6+uqrTay2LrYpAADAXDbTC7DZlJKSctLxwsJCvf7663rrrbd08cUXS5Jmz56tHj16aNmyZTrvvPMCXepJosOODfgtJcgAAGAG01tkduzYoXbt2qlLly6aPHmysrKyJEmrV69WVVWVRo8e7b22e/fuSktL09KlSxt8vYqKCrlcrjoPf4mmRQYAAFOZGmSGDRumOXPm6NNPP9WsWbOUmZmp888/X0VFRcrLy5PdbldsbGyd5yQnJysvL6/B15wxY4ZiYmK8j9TUVL/VXxtkGOwLAIA5TO1auuyyy7xf9+3bV8OGDVPHjh31zjvvKDw8vEmv+cgjj2j69One710ul9/CTAyDfQEAMJXpXUsnio2N1TnnnKOdO3cqJSVFlZWVKigoqHNNfn5+vWNqajkcDkVHR9d5+AsbRwIAYK4WFWSKi4u1a9cutW3bVoMGDVJoaKgWLVrkPb9t2zZlZWUpIyPDxCqPY9YSAADmMrVr6cEHH9T48ePVsWNH5eTk6IknnlBISIgmTZqkmJgY3XrrrZo+fbri4uIUHR2te+65RxkZGS1ixpJUd5sCAAAQeKYGmX379mnSpEk6fPiwEhMTNXLkSC1btkyJiYmSpOeff15Wq1UTJ05URUWFxowZo1dffdXMkutg40gAAMxlapCZO3fuKc+HhYVp5syZmjlzZoAqOjOMkQEAwFwtaoxMsKmdfl1UzqwlAADMQJBphhjWkQEAwFQEmWao3aKguKJa1W6PydUAAHD2Icg0Q23XklQTZgAAQGARZJohNMSqCHuIJLqXAAAwA0GmmdimAAAA8xBkmokp2AAAmIcg00xsUwAAgHkIMs3ENgUAAJiHINNM0WxTAACAaQgyzcQYGQAAzEOQaaZoxsgAAGAagkwzHd+mgOnXAAAEGkGmmWq3KWCMDAAAgUeQaSY2jgQAwDwEmWZijAwAAOYhyDQTWxQAAGAegkwznbiOjGEYJlcDAMDZhSDTTLUtMpVujyqqPSZXAwDA2YUg00yR9hCFWC2SGPALAECgEWSayWKxMAUbAACTEGR8IJop2AAAmIIg4wO1+y0xBRsAgMAiyPgAi+IBAGAOgowPRIfXjpFhLRkAAAKJIOMDtMgAAGAOgowPeMfIEGQAAAgogowPsN8SAADmIMj4ANOvAQAwB0HGB9g4EgAAcxBkfKB2ZV9aZAAACCyCjA/EMEYGAABTEGR8gDEyAACYgyDjA7UtMsUV1fJ4DJOrAQDg7EGQ8YGoY2NkDEMqKmfALwAAgUKQ8QGHLURhoTW3knEyAAAEDkHGR9imAACAwCPI+AjbFAAAEHgEGR+hRQYAgMAjyPgI+y0BABB4BBkfYZsCAAACjyDjI2xTAABA4BFkfIRtCgAACDyCjI+wTQEAAIFHkPER72BfggwAAAFDkPGR2nVkaJEBACBwCDI+Eh1eM9jXxV5LAAAEDEHGR1gQDwCAwCPI+AhbFAAAEHgEGR+JiagJMhXVHpVXuU2uBgCAswNBxkecdpsslpqvaZUBACAwCDI+YrVamLkEAECAEWR8iAG/AAAEFkHGhwgyAAAEFkHGh2IjCDIAAAQSQcaH2G8JAIDAajFB5umnn5bFYtH999/vPVZeXq6pU6cqPj5eTqdTEydOVH5+vnlFngZdSwAABFaLCDIrV67Un//8Z/Xt27fO8QceeEALFizQvHnztHjxYuXk5Ojqq682qcrTI8gAABBYpgeZ4uJiTZ48WX/961/Vpk0b7/HCwkK9/vrreu6553TxxRdr0KBBmj17tr7//nstW7bMxIobRpABACCwTA8yU6dO1eWXX67Ro0fXOb569WpVVVXVOd69e3elpaVp6dKlDb5eRUWFXC5XnUeg1AYZFsQDACAwbGb+8Llz52rNmjVauXLlSefy8vJkt9sVGxtb53hycrLy8vIafM0ZM2boySef9HWpjUKLDAAAgWVai0x2drbuu+8+vfnmmwoLC/PZ6z7yyCMqLCz0PrKzs3322qdDkAEAILBMCzKrV6/WgQMHNHDgQNlsNtlsNi1evFgvvfSSbDabkpOTVVlZqYKCgjrPy8/PV0pKSoOv63A4FB0dXecRKAQZAAACy7SupUsuuUQbN26sc+zmm29W9+7d9dBDDyk1NVWhoaFatGiRJk6cKEnatm2bsrKylJGRYUbJp0WQAQAgsEwLMlFRUerdu3edY5GRkYqPj/cev/XWWzV9+nTFxcUpOjpa99xzjzIyMnTeeeeZUfJp1S6IV17lUUW1Ww5biMkVAQDQupk62Pd0nn/+eVmtVk2cOFEVFRUaM2aMXn31VbPLalCUwyaLRTKMmlaZpCiCDAAA/mQxDMMwuwh/crlciomJUWFhYUDGy/R78nMVllXpi+kXqGtSlN9/HgAArVFjP79NX0emtWGcDAAAgUOQ8TGCDAAAgUOQ8TGCDAAAgUOQ8TFvkCklyAAA4G8EGR+L9rbIVJtcCQAArR9BxsfoWgIAIHAIMj5GkAEAIHAIMj5GkAEAIHAIMj5WG2RcBBkAAPyOIONjtMgAABA4BBkfI8gAABA4BBkfI8gAABA4BBkfqw0yZVVuVVZ7TK4GAIDWjSDjY1FhNlksNV/TKgMAgH8RZHzMarUoymGTRJABAMDfCDJ+EBPBOBkAAAKBIOMHrCUDAEBgEGT8gJlLAAAEBkHGDwgyAAAEBkHGDwgyAAAEBkHGD6IJMgAABARBxg9okQEAIDAIMn5AkAEAIDAIMn5AkAEAIDAIMn7AOjIAAAQGQcYPaJEBACAwCDJ+QJABACAwCDJ+UBtkSivdqnJ7TK4GAIDWiyDjB1Fhod6vaZUBAMB/CDJ+EGK1KCrMJokgAwCAPxFk/IRxMgAA+B9Bxk8IMgAA+B9Bxk9YSwYAAP8jyPgJLTIAAPgfQcZPvEGmlCADAIC/EGT8hBYZAAD8jyDjJ9EEGQAA/I4g4ye0yAAA4H8EGT8hyAAA4H8EGT8hyAAA4H8EGT9hHRkAAPyPIOMntMgAAOB/BBk/qQ0yJZVuVbk9JlcDAEDrRJDxk9rp1xLdSwAA+AtBxk9CrBZFOWyS6F4CAMBfCDJ+VNsqU0CQAQDALwgyfsSAXwAA/Isg40dMwQYAwL8IMn5EiwwAAP5FkPEjb5ApJcgAAOAPBBk/iomgRQYAAH8iyPiRd4xMOUEGAAB/IMj4kfPYOjIlFW6TKwEAoHUiyPhR5LEgU1RRbXIlAAC0TgQZPzreIkOQAQDAHwgyfhQVVhNkissJMgAA+ANBxo9qu5aKaZEBAMAvTA0ys2bNUt++fRUdHa3o6GhlZGTok08+8Z4vLy/X1KlTFR8fL6fTqYkTJyo/P9/Eis+MkyADAIBfmRpkOnTooKefflqrV6/WqlWrdPHFF2vChAnavHmzJOmBBx7QggULNG/ePC1evFg5OTm6+uqrzSz5jJwYZAzDMLkaAABaH4vRwj5h4+Li9Oyzz+qaa65RYmKi3nrrLV1zzTWSpB9++EE9evTQ0qVLdd555zXq9Vwul2JiYlRYWKjo6Gh/ln6S4opq9X7iM0nSD0+NVVhoSEB/PgAAwaqxn98tZoyM2+3W3LlzVVJSooyMDK1evVpVVVUaPXq095ru3bsrLS1NS5cubfB1Kioq5HK56jzMEhEaIoul5usiBvwCAOBzpgeZjRs3yul0yuFw6M4779R7772nnj17Ki8vT3a7XbGxsXWuT05OVl5eXoOvN2PGDMXExHgfqampfv4NGma1WhRpZwo2AAD+YnqQOffcc7Vu3TotX75cd911l6ZMmaItW7Y0+fUeeeQRFRYWeh/Z2dk+rPbMMeAXAAD/sZldgN1uV9euXSVJgwYN0sqVK/Xiiy/quuuuU2VlpQoKCuq0yuTn5yslJaXB13M4HHI4HP4uu9EiHTXjYuhaAgDA90xvkfkxj8ejiooKDRo0SKGhoVq0aJH33LZt25SVlaWMjAwTKzwzzrCajSPpWgIAwPdMbZF55JFHdNlllyktLU1FRUV666239PXXX+uzzz5TTEyMbr31Vk2fPl1xcXGKjo7WPffco4yMjEbPWGoJouhaAgDAb0wNMgcOHNCNN96o3NxcxcTEqG/fvvrss8/0k5/8RJL0/PPPy2q1auLEiaqoqNCYMWP06quvmlnyGavtWiLIAADge6YGmddff/2U58PCwjRz5kzNnDkzQBX5ntNR07VEkAEAwPda3BiZ1sZ5rEWGMTIAAPgeQcbPnMd2wGbWEgAAvkeQ8TO6lgAA8J8mBZl//OMf+uijj7zf/+Y3v1FsbKyGDx+uvXv3+qy41oCuJQAA/KdJQeYPf/iDwsPDJUlLly7VzJkz9cwzzyghIUEPPPCATwsMdrVdS7TIAADge02atZSdne1djXf+/PmaOHGi7rjjDo0YMUKjRo3yZX1Br3avJYIMAAC+16QWGafTqcOHD0uSPv/8c++6L2FhYSorK/Ndda2At0WGwb4AAPhck1pkfvKTn+i2227TgAEDtH37do0bN06StHnzZnXq1MmX9QU9No0EAMB/mtQiM3PmTGVkZOjgwYN69913FR8fL0lavXq1Jk2a5NMCgx1BBgAA/2lSi0xsbKxeeeWVk44/+eSTzS6otantWiqpqJZhGLJYLCZXBABA69GkFplPP/1US5Ys8X4/c+ZM9e/fXzfccIOOHj3qs+Jag9oWGY8hlVW5Ta4GAIDWpUlB5te//rVcLpckaePGjfrVr36lcePGKTMzU9OnT/dpgcEuPDRE1mONMAz4BQDAt5rUtZSZmamePXtKkt5991399Kc/1R/+8AetWbPGO/AXNSwWiyIdNhWVV6uoolpJZhcEAEAr0qQWGbvdrtLSUknSF198oUsvvVSSFBcX522pwXFRjuPjZAAAgO80qUVm5MiRmj59ukaMGKEVK1bo7bffliRt375dHTp08GmBrYEzzCYV0rUEAICvNalF5pVXXpHNZtN//vMfzZo1S+3bt5ckffLJJxo7dqxPC2wNIpmCDQCAXzSpRSYtLU0ffvjhSceff/75ZhfUGrGWDAAA/tGkICNJbrdb8+fP19atWyVJvXr10hVXXKGQkBCfFddaEGQAAPCPJgWZnTt3aty4cdq/f7/OPfdcSdKMGTOUmpqqjz76SOnp6T4tMtgRZAAA8I8mjZG59957lZ6eruzsbK1Zs0Zr1qxRVlaWOnfurHvvvdfXNQY9No4EAMA/mtQis3jxYi1btkxxcXHeY/Hx8Xr66ac1YsQInxXXWjiZfg0AgF80qUXG4XCoqKjopOPFxcWy2+3NLqq1qQ0yRQQZAAB8qklB5qc//anuuOMOLV++XIZhyDAMLVu2THfeeaeuuOIKX9cY9LzTr+laAgDAp5oUZF566SWlp6crIyNDYWFhCgsL0/Dhw9W1a1e98MILPi4x+EXV7oBdSZABAMCXmjRGJjY2Vu+//7527tzpnX7do0cPde3a1afFtRZOWmQAAPCLRgeZ0+1q/dVXX3m/fu6555peUSvEyr4AAPhHo4PM2rVrG3WdxWJpcjGtFevIAADgH40OMie2uODM0LUEAIB/NGmwL86M0zvY1y2PxzC5GgAAWg+CTADUtshIzFwCAMCXCDIB4LBZZbPWjB0qqXCbXA0AAK0HQSYALBbL8f2WKqpMrgYAgNaDIBMgkfZj2xQw4BcAAJ8hyASId3VfupYAAPAZgkyAHF8Uj64lAAB8hSATIMcXxaNFBgAAXyHIBIh3sG85LTIAAPgKQSZAnHa2KQAAwNcIMgFyfPo1XUsAAPgKQSZAGOwLAIDvEWQCJMrB9GsAAHyNIBMgtV1LLIgHAIDvEGQChK4lAAB8jyATIHQtAQDgewSZADneIkPXEgAAvkKQCRAnQQYAAJ8jyARIlHdlX4IMAAC+QpAJkNqupbIqt9wew+RqAABoHQgyARLpCPF+TfcSAAC+QZAJEIctRPaQmttNkAEAwDcIMgFUuyheCUEGAACfIMgEUO3MJVb3BQDANwgyARTpoEUGAABfIsgEUBRryQAA4FMEmQCqnbnEWjIAAPgGQSaAnGGhkmiRAQDAV0wNMjNmzNCQIUMUFRWlpKQkXXnlldq2bVuda8rLyzV16lTFx8fL6XRq4sSJys/PN6ni5mGbAgAAfMvUILN48WJNnTpVy5Yt08KFC1VVVaVLL71UJSUl3mseeOABLViwQPPmzdPixYuVk5Ojq6++2sSqm855rGuJwb4AAPiGzcwf/umnn9b5fs6cOUpKStLq1at1wQUXqLCwUK+//rreeustXXzxxZKk2bNnq0ePHlq2bJnOO+88M8puMqejpmupiCADAIBPtKgxMoWFhZKkuLg4SdLq1atVVVWl0aNHe6/p3r270tLStHTp0npfo6KiQi6Xq86jpWCwLwAAvtVigozH49H999+vESNGqHfv3pKkvLw82e12xcbG1rk2OTlZeXl59b7OjBkzFBMT432kpqb6u/RGi2JlXwAAfKrFBJmpU6dq06ZNmjt3brNe55FHHlFhYaH3kZ2d7aMKm692QTy6lgAA8A1Tx8jUmjZtmj788EN988036tChg/d4SkqKKisrVVBQUKdVJj8/XykpKfW+lsPhkMPh8HfJTeJkZV8AAHzK1BYZwzA0bdo0vffee/ryyy/VuXPnOucHDRqk0NBQLVq0yHts27ZtysrKUkZGRqDLbbbariWmXwMA4BumtshMnTpVb731lt5//31FRUV5x73ExMQoPDxcMTExuvXWWzV9+nTFxcUpOjpa99xzjzIyMoJuxpJ0vGuJwb4AAPiGqUFm1qxZkqRRo0bVOT579mzddNNNkqTnn39eVqtVEydOVEVFhcaMGaNXX301wJX6BgviAQDgW6YGGcMwTntNWFiYZs6cqZkzZwagIv+qDTIV1R5VuT0KDWkxY60BAAhKfJIGUG3XksSAXwAAfIEgE0ChIVaFhdbc8iLGyQAA0GwEmQCLPrYDdmFZlcmVAAAQ/AgyARYXaZckHS6pNLkSAACCH0EmwOKdNUHmSEmFyZUAABD8CDIBFh9Zs+rw4WJaZAAAaC6CTIDRtQQAgO8QZAIs/liQOUKLDAAAzUaQCbA4Jy0yAAD4CkEmwLxjZBjsCwBAsxFkAuz4rCVaZAAAaC6CTIDFMUYGAACfIcgEWO1g36KKalVUu02uBgCA4EaQCbDosFDZrBZJdC8BANBcBJkAs1otalO7lgzdSwAANAtBxgTetWRokQEAoFkIMiaII8gAAOATBBkTxDtr1pI5VMxaMgAANAdBxgR0LQEA4BsEGRPQtQQAgG8QZEzADtgAAPgGQcYECbUbRzJGBgCAZiHImCDu2MaRdC0BANA8BBkT0LUEAIBvEGRM4N1vqbz+/ZZW7z2iWV/vksdjBLo0AACCis3sAs5GMeGhCrFa5PYYOlpSpZSYkDrnH31vk37IK1Kf9jEa2S3BpCoBAGj5aJExgdVqUZuI2u6lugN+3R5Duw+WSJK25xcFvDYAAIIJQcYkDS2Kl1NQpkq3R5K0+1BxwOsCACCYEGRMEu+sfwfszEMl3q93HSgRAABoGEHGJA3NXNpz+Hh4oUUGAIBTI8iY5HjXUt0xMrXjYyQp31Wh4orqgNYFAEAwIciYpKFF8U5skZGkzIN0LwEA0BCCjElqx8gc+tEYmT3HxsjYbTX/aXYdpHsJAICGEGRMUt+spSq3R9lHyyRJw9PjJUm7CTIAADSIIGOSuHqCzL6jZXJ7DIWFWr1BZtchupYAAGgIQcYk8fXsgF3brdQpPlLpiU5JdQf/AgCAuggyJok/NtjXVV6tyuqaBfAyTwgyXY4FmcxDxey5BABAAwgyJqndb0mSjpbWdC/VzljqnBip1DbhCg2xqLzKo5zCMtPqBACgJSPImKRmv6VQScdX961tkekcHylbiFUd4yMl0b0EAEBDCDImOr66b804mdoWmU4JNQGmS0JtkGHmEgAA9SHImCj+hEXxKqs92n9s6nWnhAhJ8o6T2c3MJQAA6kWQMVHcCRtHZh0plceQIu0hSnTWBJwuiTUtMiyKBwBA/QgyJjpxUTzv1OuESFksNYOAmYINAMCpEWRMdOIYmR+Pj5Gk9GMtMrmF5SqtZPNIAAB+jCBjovhjXUiHiyvrzFiqFRth94YdWmUAADgZQcZEdbqW6mmRkY7PXGKcDAAAJyPImOjE/ZYyj7W4dD42Y6kW42QAAGiYzewCzma1LTK5heUqq3JLqtme4ES1M5eYgg0AwMlokTFR7RiZ2hATFWbzttLU8q4lQ9cSAAAnIciYKDY8VMe2W5IkdT5h6nUtb4vMwRI2jwQA4EcIMiaq2W/peAvMj7uVJCktLkI2q0VlVW7lucoDWR4AAC0eQcZkJ3Yl/XjGkiSFhliVFl8zAJgBvwAA1EWQMVm883iQ+fGMpVpdEmr3XGKcDAAAJyLImKx240ip/q4l6fgKv7TIAABQF0HGZCd2LXWup2tJYvNIAAAaQpAxWW2QiY0IVWyEvd5ruibVdC1tzy8KWF0AAAQDgozJEo6NkWmoW0mSzk2JliTluyp0pKQyIHUBABAMTA0y33zzjcaPH6927drJYrFo/vz5dc4bhqHHH39cbdu2VXh4uEaPHq0dO3aYU6yfnN8tUemJkbpuSGqD1zgdNnU8NnNpa64rUKUBANDimRpkSkpK1K9fP82cObPe888884xeeuklvfbaa1q+fLkiIyM1ZswYlZe3nvVUOiVEatGvRmnS0LRTXtc9JUoSQQYAgBOZutfSZZddpssuu6zec4Zh6IUXXtBvf/tbTZgwQZL0z3/+U8nJyZo/f76uv/76QJZquh5to/XZ5nxtIcgAAODVYsfIZGZmKi8vT6NHj/Yei4mJ0bBhw7R06dIGn1dRUSGXy1Xn0Rr0aFszTuaHXAb8AgBQq8UGmby8PElScnJynePJycnec/WZMWOGYmJivI/U1IbHngSTnseCzM4Dxapye0yuBgCAlqHFBpmmeuSRR1RYWOh9ZGdnm12ST3RoE64oh02Vbg/ryQAAcEyLDTIpKSmSpPz8/DrH8/Pzvefq43A4FB0dXefRGlgsFnVvy4BfAABO1GKDTOfOnZWSkqJFixZ5j7lcLi1fvlwZGRkmVmae2nEyWxknAwCAJJNnLRUXF2vnzp3e7zMzM7Vu3TrFxcUpLS1N999/v/73f/9X3bp1U+fOnfXYY4+pXbt2uvLKK80r2kTHgwwtMgAASCYHmVWrVumiiy7yfj99+nRJ0pQpUzRnzhz95je/UUlJie644w4VFBRo5MiR+vTTTxUWFmZWyaYiyAAAUJfFMAzD7CL8yeVyKSYmRoWFhUE/Xqas0q1eT3wqjyGtePQSJUWdnYEOAND6Nfbzu8WOkcHJwu0h6nRsh2zGyQAAQJAJOnQvAQBwHEEmyPQkyAAA4EWQCTI9WEsGAAAvgkyQqe1a2nWwROVVbpOrAQDAXASZIJMSHaaY8FC5PYZ2HmCrAgDA2Y0gE2QsFou3e2kL3UsAgLMcQSYIMXMJAIAaBJkgRJABAKAGQSYI9Txh88hWvjAzAACnRJAJQl2TnAqxWlRYVqXcwnKzywEAwDQEmSAUFhqi9MSarQo27Cs0uRoAAMxDkAlSw9MTJEmzFu+iewkAcNYiyASpuy9KV4Q9ROuzC/ThhlyzywEAwBQEmSCVFBWmX16QLkl65rMfVFHNKr8AgLMPQSaI3X5BZyVFOZR9pEz/WrrXZ687f+1+vfDFdrk9dFkBAFo2m9kFoOki7Db96tJz9NC7G/Xylzv1s0GpiokIlSS5PYZmf5ep/6zep0iHTcnRDiVFhSk5OkzndYnTgLQ29b7mAVe5Hpy3XtUeQx6PoemXnhvIXwkAgDNCkAly1wxK1d+X7NG2/CK98tUOPXp5T+09XKIH563Xyj1H632O3WbVN7++SCkxYSede3tltqqPtcS8/NVODeoUpwvPSfTr7wAAQFPRtRTkQqwWPTyuuyTpH9/v1cuLdmjsC99q5Z6jirSH6Hfje+rVyQP1xPieumtUurokRKqy2qPZ32We9Fpuj6F/r8iSJHVLcsowpAfeXqfcwrKA/k4AADQWQaYVGHVOokZ2TVCl26P/W7hdZVVuZXSJ16f3X6CbRnTWuD5tdfOIznpobHc9enkPSdKby7PkKq+q8zpf/XBAOYXlahMRqv/cNVy92kXrSEml7nlrrarcHjN+NQAATokg0wpYLBY9Mq677CFWhYVa9bvxPfXmbcOUGhdx0rUXnZukbklOFVdU663lWXXOvbG8ZsDwzwanKiY8VK9OHqgoh02r9h7Vnz7bJkkyDEMHXOVavvuw1mbV33UFAECgWIxWvpqay+VSTEyMCgsLFR0dbXY5frXnUIkiHCFKijp57MuJ5q3K1q//s0FJUQ59+9BFcthClH2kVBc8+5UMQ/r6wVHqlFCzcvCnm3J15xtrJNVsVpl1uEQllcener91+zDv4nwAAPhKYz+/aZFpRTolRJ42xEjShP7tlRzt0IGiCr2/NkeS9NaKLBmGdH63BG+IkaSxvdvqlhGdJdXstl1S6ZbVIkWH1YwTf+XLnX74TQAAaBxmLZ2F7Darbh3ZWX/4+Af9+ZtduqJ/O72zMluSNHlYx5Ou/59x3dUvNUbhoSHqkhip1LgIHSqu1IXPfKXvdx3WmqyjGljPdO53VmZr7sosvXLDQLWLDff77wUAOPvQInOWmjQ0TVEOm3YdLNFD727Q4ZJKJUc7NLpH0knX2kKsmtC/vS7tlaKuSVFy2ELUPjZcVw1oL0l69atdJz1n54Ei/Xb+Jq3JKtAby3y3WB8AACciyJylosJCNfm8mtaX99fVdC9dPyRNtpDG/y9x56h0WSzSF1vz9UOey3vc4zH08LsbVXlsptOnm/LY2BIA4BcEmbPYzSM6yX4suIRYLZo0NO2Mnp+e6NS43m0lSbO+Pt4q8+aKLK3aW7OOjT3Eqt2HSrTjQLHvCm+Bdh8s1rS31mhLjuv0FwMAfIYgcxZLjg7zdg+N7pFU70q/p3PXqJqNKxesz1HW4VLlFZbrj5/8IEn69ZhzNbJbzYymTzfl+ajqlumJDzbrww25+s2762l9AoAAIsic5R79aQ/9esy5eurK3k16fu/2MbrwnER5DGnW4l367fxNKq6o1oC0WP0io5PG9k6RJH3SioPMuuwCfbvjkCRp036XFm7JN7kiADh7EGTOctFhoZp6UddGTdtuyNSLukqS5q7M0hdb8xUaYtEfJ/ZViNWin/RIVojVoq25Lu09XOKrsluU2inoUY6aSYDPf7FDHnYOB4CAIMig2YZ2jtOQTm1U26Ny16iuOic5SpLUJtKu87rESWqd3Utbc136Ymu+LBZpzi1D5XTYtDXXpc82t77fFQBaIoIMfOKei7tJqtlscupF6XXOjT02ILg1di/N/KqmNebyPm01qGMb3TKikyTp+S+20yoDAAFAkIFPXHBOohZMG6l3fpkhhy2kzrkxPZNlsdSMJWlNO2nvOlisjzbmSjrevXbryC6KCrNpe/7xcwAA/yHIwGf6dIhRm0j7SceTosM06NjKv5/5qFWm2u1R0Y92724Kt8dQXmF5k5776le7ZBjST3omq0fbmn1AYiJCdfv5XSRJL3yxXW5aZQDAr9iiAAExtneKVu09qk825emmY3s3NUVJRbXeWLZXf/12tw4VVyoxyqFuSU51S3Kqa3KURp2TWO+u3/VxlVfptn+s0orMIzo3OUpX9G+nK/q1a9Tzs4+Uav66/ZKkacdaY2rdPKKTXl+SqV0HS7RgfY6uPDbF3ZfyCssVHW5ThJ0/YZxdqtweFZVXq01EqCwWS8B/fmFZlardHsU7HQH/2agfu18jIPYdLdXIP34lq0Va8ehoJZzhm0BhWZX++f0evf5dpgpKT90SM7RTnCYOaq9xfdoqKiy03msOF1foxr+v0OZ6FrAbkBar20Z20eV92zb4Mx59b6PeXJ6l87sl6F+3Djvp/MyvdurZz7apU3yE3p82UjHhJ9dR7fboT59v19qso7qifztN6N9eTsfpg8mHG3J039x1So5y6N27h6ttTMvcx6q8yi2HzWrKhw1ap/Iqt67781Kt31eo2IhQnZMcpXOSneqWFHXS305KTJiGp8f79P+//67Zp9/O3ySb1aKP7j2/0f9oamk8HkOvfbNLew6V6MEx5zZr1qo/NfbzmyCDgBn/8hJt3F+oGVf38a4i7PYYcpVV6UBRhfJd5cp3letAUYUOFlXoSEmlDpdU6HBxpbKOlKq00i1J6pwQqbtHpesnPZO153CpduQXaeeBYm3YV6hlmYe9s6fCQq0a2ytF1w5J1Xmd42W11ryh5RaW6ed/W65dB0sUH2nXzMkDtfdwiT5Yn6Pvdx1//h0XdNFDY7srxHr8jdAwDP3lm93646c/yGNI7/wyQ0M7x530u5ZUVOvCZ7/SoeJKpSdG6u83DVHH+Mg65+/591p9+cMB77EIe4gm9G+nSUPT1Kd9TL1vwB9tyNW9c9d6u6y6p0TpnTszFN1AYAuUskq3lmce1uYcl7bkuLQ5p1B7DpeqS0Kknriily48J9HU+szm8Rj6+3eZ2pZXpJ+f11H9UmPNLqnJjpZU6khppdITnQH/2Y+/v0n/XNr4vdtuGJampyb0rvM33BTFFdV6fP4m/Xftfu+xC85J1D9uHtKooOQqr9Lfvtmt/mmxurh7crNqaa6KarcenLdBC9bXbE2T4HTopev7a3jXBFPrqg9B5hiCTMtR20oREx6qqDCbCsuqVFxRrcb+H9gtyalpF3fVT/u2a/CNKbewTO+t3a93V+/TroPH163pGB+hawenaljnON03d532F5SpXUyY3rhtmLqc8IZ8wFWu17/L1J8X75YkXdI9SS9c319RYaFylVfpwXfW6/NjC95NGpqmGVf3abDeLTku3fqPlcotLFebiFD9+ReDNbRznPJd5bplzkptznHJYbPqxoyOWvTDAe0+od7BHdvoV5eeq4z0eO+xTzbmatq/a0LM5X3bakXmER0sqtCIrvGafdNQ2W3NH/JWUlGtTzblyemwqW+HGLWNCTvlG3VOQZn+uXSv/r0iS4VlDbeUjeuTosd+2vO0rUcfrM/RP77fo+k/OUcjAvDGahjGaT+INucUKiw0pMkf3EdKKvXA2+u0ePtB77Hh6fG6a1S6RnZNCIoWq/0FZVq4OU+fbc7Xij1H5PYYenRcD91+QZdGP//pT35QfKRdj17eQ6FnsKdbrU835enON1ZLkv7yi0Fq3yZc2/OLtC2vWLsOFquiumZvN4skj2Foyc5DMgxpQv92+tPP+jXpZ0rSpv2Fuuffa5V5qERWi3TT8M56Y/leVVZ79MJ1/U/bdVxW6daNf1+ulXuOSpLG9krRkxN6KTn61K0gldUebc8v0tHSSg3tHHfSJIoTX39t9lG1ibArNS7ilK26hWVVuvNfq7V092HZrBZ1aBOuPYdLZbFI917cTfde0q3Zoc+XCDLHEGRajr2HS3TJ/y1WdT0DYOMi7UqKcigpOkxJUQ4lRjkUH2lXvNOuuEiHEp0OdU+J8raqnI5hGFq/r1DvrMrWB+tyVFxRXed8l4RI/eu2YWofW/8H64L1OXpw3npVVHt0TrJTD43trqc+3KI9h0tlD7HqiSt66oahaaf9EDrgKtdt/1ylDfsKZQ+x6r7R3fTGsr3KLSxXfKRdf50yWAPT2sgwDK3IPKJ/r8jSxxvzvBtujuyaoF9deo7yXeWa9tZaVXsMXT2wvZ69pp+25rp03Z+XqqTSrasGtNdz1/Zr1ofilhyXpv17TZ1AleC0q0/7GJ2TEqW4CLtiI0IVE25XaIhF763dr0825Xlbh9rHhmtQxzbq1S5aPdtFq1N8pGZ/t0f/WLpHbo+hCHuI7rukm24a0emkN2WPx9Czn2/z7tkVH2nX5w9c4NdxCF9tO6DfvrdJnRIi9Kef9TspZBmGoVe/3qVnP9sme4hVL03q711KoLFW7z2iaW+tVW5huRw2qy48J1Ff/nDA+zfQu320uiQ4dbS0suZRUqWKao8SnHYlRYcp0elQUnTNOLDBHeOUGhfe5P/G1W6PQqyWRj2/oLRSK/cc1YrMw/p+1+F6u2Al6bGf9tStIxse82YYht5fl6PH3t+kovKav8Gf9EzWKzcMaPCDuT77jpZq3IvfylVerV9e0EWPjOtx2ucsWJ+jB95ep2qPodE9an5mWGjNzywsrdLnW/K0eu9RVVR7VOn2qKraoyq3R+VVHpVVuVVW6VZZlVs5BWWq9hhqFxOmFycN0JBOcXrlyx360+fbFRdp1xfTL1RcPZMcpJowcse/VunrbQcVYQ9RRbVHbo+hKIdNv7msuyYPTZPFIh0oqtCug8XafbBEm3Nc2rS/UNvyirzvA8nRDt1+fhdNGpqmyGNB5WhJpf6xdI/+8f0eHT2huz0u0q7UNuFKT3RqQFqsBqS10bkpUTpUXKGb/r5S2/KL5HTYNOvnAzW4Y5yeXLBZc1dmS5IyusTrzlHpNe/FUQ61ibA3+j3XHwgyxxBkWpatuS7lucoVEx7qfUSHhfqkNaEhpZXV+mhDrt5ema1Ve4+qZ9to/eOWoUqMOvWH5PrsAt3+z1U6UFThPdY+NlyvTh54Rl0DZZVuPfD2On16wiJ56YmRmnPz0Hr72PNd5Xrly52auzJLVe6aP0+rRfIY0tUD2uvZn/Xz/qtp8faDumXOSrk9hu64oItGdk3QroPF3jfFmPBQXd63rS7pnqxwe/0fHIZh6I1le/XUR1tVWe1RUpRD8U6HtucXNWrWVUaXeN0ysrMu7p5U77/mtuS49Nj7m7R6b82/SJOjHbrjgnTdMDRN4fYQFZVX6YG31+mLrTXdbHGRdh0pqdRlvVP06uSBPm+xqKh265lPt+n1JZneY3GRdj13bT+NOjdJUs1YjIfe3eDdGV6q+W/w9MS+unZw6kmvmVdYrl0Hi2UYkqGae7ZhX6GeX7hd1R5DXRIiNXPyQPVoG619R0v1t28z9fbKbJVVuc+o9sQohwZ3bKNzkqN0uKRC+4+WaX9BmXIKyuV02DSkc5yGdmqjoZ3jlZ4Yqa25Rfpu1yF9t/OQVu05qkhHiB6+rIcmDmx/0n0trazW7O/2aMH6HG3LL6rTUmqx1LQSjumVokt7pmje6my9fGxF69+N71nvAP6C0ko9On+TPtpQswxBz7bR2nmwWJXVHl10bqJm/XyQN1hI0p5DJXpt8S4VVVTr6gHtNercmv+fqtweXffnpVqTVaD+qbGad2dGo1tXvvwhX3e9sUYV1R4NT4/XxIEd9PHGXH2z46D3b6sxLu2ZrGeu6avYiJrAUlnt0fiXl2hbfpGuHthez13b/6TnuD2G7pu7Vh9uyFVYqFVv3DpMkQ6bHv7vRq3PLpBU835S2zJdn+gwm+w2qw4VV0qSYsJDNSWjo1zl1XX+/0lwOlTt8TQ4fjAs1Cp7iFWu8molRTk0++Yh6tUuxnv+vbX79Oh7m7zd97VsVou6Jjn1+6t6a1DHk7vQ/Y0gcwxBBifKd5UrLtLe6DfCvMJy3f7PVdq4v1AXnJOoF6/rX+8U89M5scVheHq8Zk0epJiIU49ryT5Sqpe/3KF31+yX22PoqgHt9acTQkytd1Zl6zf/2XDK14qwh+gnPZN1eZ+2SokJk91mVWiIVRZJz362zbtY4SXdk/Tsz/opLtKu8iq3tuS6tHFfoTIPlchVVqWCsioVllWpqLxK/TrE6uYRndWz3en/rjweQ/9Zs0/Pfb5dea6a6e7xkXbdmNFJH23M0fb8YtltVj0zsa+6Jjl15czvVO0x9NKkAbqiX7vTvv6JDMPQpv0uffnDATnDbN4WouiwUGUeKtE9/16jTftrWhhuGJamDfsKvN/fPSpdPz+vo+56c43WZxfIZrXoifE9tWm/S2+vqvlX628v76Hbjk2x35ZXpNcW79IH63MaDH3j+7XTjKv7nNTkf6SkUh+s269qj6E2EXa1iQxVbIRd9hCrDhVXeMeK5RWWa+P+Qm3OKTyjD98Qq6XBms7rEqffX9VH6YlOVbk9entltl74YocOFR8P7emJkRraOV7DOsdpRNeEOsHfMAw9+9k2vXqsBe2pCb30i4xOKq6o1vrsAq3ee1RvLNurA0UVslktuveSbrp7VLqW7T6i2/65UuVVHp3fLUF/+cVgFZVX6cVFO/T2yuw6rbXtY8M1aWiqDhVXas73exTlsOnj+858gO3SXYd12z9WquRHH9LnJDs1ukey2kTUtDCGHvubCAsNUXhoiCLsIQoLDVGbiFB1Tog8KfitzTqqq2d9L8OQ/nXrUJ3f7fg4MMMw9D/vbdS/V2QrNMSiv00Z4h0n5vbU/MPhmU9/8NZktUhpcRHqkujUuSlR6tM+Rr3bxSg1LlyVbo/mr92v1xbvVuahutu89GoXrTsvTNdlvVNkC7HKVV6lfUfKlHWkVFtzXVqbXaB1WUflOtYa1jXJqTk3D1GHNiffw50HivWnz7Zpz+ESHTg2RrFWaIhFv7uiV6NaoX2JIHMMQQbNVdtX3bNtdLObWY+UVJ7xtNHMQyXallekn/RMbrD/etbXu/TylzvULjZc6YmRSk90qkuiU7sOFmvB+hztO3rqhQhDQyx6aGx33Tqys1/fqCqq3frvmv169eudyj5yvKakKIf+euNgb0vXC19s1wtf7FBsRKg+f+CCk2ZV5BaWqdptyBFa88ETZgvRnsM1090XrM/RnsOlJ/3stLgIHSquUGmlW20iQvXsNf00umeyyqvc+sPHW72DSG1Wi6o9hmLCQzXr5wM1PD1BhmFoxic/6C/f1IydmpLRUfuOlmnRCYO1uyRGyn5CQHbYrLpuSJomDU31yT0tr3Jrw75Crd57VJmHipUUFab2bcLVPjZc7WLDdaCoXCszj2rlniNak3VUpZVuRTlsGtYlTsPTE5SRHq9vth/U819sV3mVR/YQq24YlqZvth/U7mMfkGlxEbrn4q4adW7SaVssDcPQ05/+4B1Pdk6yUzsPFOvE7NQlMVIvXNdffTvEeo8t231Yt8xZqdJKt85JdirrSKnKq2q6UC46N1FdEp16d82+k1oXZt4w8JQzCU9lXXaB7npjtSIdNl3ep60u79vWu41Kc/zug82a8/0epcaF69lr+inrSKn2Hi7Rhn2F+nbHIVkt0suT6q/7YFGFNuUUKrVNuNLiIk/bKu32GPpsc57eWLZX4aEhumVk50bNyvJ4DGUeLtGO/CIN75rQ6IkBldUeHSgq1x8+3qqPN9b8Q+e6wal6ckKvOi1p/kSQOYYgg7OdYRham12gD9bl6NsdB1VW6Val26OKY2MCOsVH6o8T+wZ0Jk2126MP1ufor99mKjY8VC9c37/O4Mcqt0dXzvxOm3NcGt0jSX+9cbA8hrRoa77+9m2mVuw5csrXDwu16qJzk1TtMbQlx6X9BcdD03ld4vTCdQOUElM3HH24IUcPv7tRxRXV6prk1N9uHKxOCcdnmp04ZqaWxSKN691Wd16Yrj4dYtRSVLk9yikoU/vYcNl+1PqYfaRUv52/qc7g4/hIu+69pJsmDU07o25ewzD0+4+26m8ndNO1jw3XwI5tNLRTG10zKLXeLs1Ve47optkrvV0qgzq20W/GnKthXWoGt5dXufXxxly9uTxLq/ce1U3DO+l3V/Q6o3sQCMUV1br0ucXKaWBRzT9O7KPrhqQFuCrfMgxDry3erWc++0GGIfVLjdW9F3eVxSJ5PJKhmsHVPdtG+3w6OkHmGIIMEJx+yHNp/MtLVOU2dP2QVC3bfdjb0hJitcgeYlV5tds7liM0xKILz0nU+H7tNLpHsndQpFQzMHJLrktVbo/O75bYYMtW1uFSLd5xUBP6t2vwX65vLt+rlxft1EXdE3X7+V3qzHoLFoZh6KONuZr93R6N6Jqg28/v3OCaS415rU825ckiaWDHNqedjVNr475Czf4+U5f1bqvRPZIabFkoLK1SdLitxc7uWrLjkKa+tUZRYTZ1io9Up4QIdYqP1OBOceofxNPsf2zx9oO6999rG5ydeOKyGr5CkDmGIAMEr9op+7Wiw2yafF5HTcnopJSYMBmGoYpqjyqqPLLbrA0OaAbQfFmHS/XUR1uUV1gui0WyWCyyWiSrxaJfXtBFl/ZK8enPI8gcQ5ABgle126O731yjvYdLdcOwNF0zqEOdlhYArVdjP795RwDQYtlCrPrLjYPNLgNAC8bu1wAAIGgRZAAAQNAiyAAAgKBFkAEAAEGLIAMAAIIWQQYAAAQtggwAAAhaBBkAABC0CDIAACBoBUWQmTlzpjp16qSwsDANGzZMK1asMLskAADQArT4IPP2229r+vTpeuKJJ7RmzRr169dPY8aM0YEDB8wuDQAAmKzFB5nnnntOt99+u26++Wb17NlTr732miIiIvT3v//d7NIAAIDJWnSQqays1OrVqzV69GjvMavVqtGjR2vp0qX1PqeiokIul6vOAwAAtE4tOsgcOnRIbrdbycnJdY4nJycrLy+v3ufMmDFDMTEx3kdqamogSgUAACawmV2Arz3yyCOaPn269/vCwkKlpaXRMgMAQBCp/dw2DOOU17XoIJOQkKCQkBDl5+fXOZ6fn6+UlJR6n+NwOORwOLzf194IWmYAAAg+RUVFiomJafB8iw4ydrtdgwYN0qJFi3TllVdKkjwejxYtWqRp06Y16jXatWun7OxsRUVFyWKx+Kw2l8ul1NRUZWdnKzo62mevi/pxvwOHex043OvA4V4Hjq/utWEYKioqUrt27U55XYsOMpI0ffp0TZkyRYMHD9bQoUP1wgsvqKSkRDfffHOjnm+1WtWhQwe/1RcdHc0fRQBxvwOHex043OvA4V4Hji/u9alaYmq1+CBz3XXX6eDBg3r88ceVl5en/v3769NPPz1pADAAADj7tPggI0nTpk1rdFcSAAA4e7To6dctmcPh0BNPPFFnYDH8h/sdONzrwOFeBw73OnACfa8txunmNQEAALRQtMgAAICgRZABAABBiyADAACCFkEGAAAELYJME82cOVOdOnVSWFiYhg0bphUrVphdUtCbMWOGhgwZoqioKCUlJenKK6/Utm3b6lxTXl6uqVOnKj4+Xk6nUxMnTjxpCwucuaeffloWi0X333+/9xj32nf279+vn//854qPj1d4eLj69OmjVatWec8bhqHHH39cbdu2VXh4uEaPHq0dO3aYWHFwcrvdeuyxx9S5c2eFh4crPT1dTz31VJ29erjXTfPNN99o/PjxateunSwWi+bPn1/nfGPu65EjRzR58mRFR0crNjZWt956q4qLi5tfnIEzNnfuXMNutxt///vfjc2bNxu33367ERsba+Tn55tdWlAbM2aMMXv2bGPTpk3GunXrjHHjxhlpaWlGcXGx95o777zTSE1NNRYtWmSsWrXKOO+884zhw4ebWHXwW7FihdGpUyejb9++xn333ec9zr32jSNHjhgdO3Y0brrpJmP58uXG7t27jc8++8zYuXOn95qnn37aiImJMebPn2+sX7/euOKKK4zOnTsbZWVlJlYefH7/+98b8fHxxocffmhkZmYa8+bNM5xOp/Hiiy96r+FeN83HH39sPProo8Z///tfQ5Lx3nvv1TnfmPs6duxYo1+/fsayZcuMb7/91ujatasxadKkZtdGkGmCoUOHGlOnTvV+73a7jXbt2hkzZswwsarW58CBA4YkY/HixYZhGEZBQYERGhpqzJs3z3vN1q1bDUnG0qVLzSozqBUVFRndunUzFi5caFx44YXeIMO99p2HHnrIGDlyZIPnPR6PkZKSYjz77LPeYwUFBYbD4TD+/e9/B6LEVuPyyy83brnlljrHrr76amPy5MmGYXCvfeXHQaYx93XLli2GJGPlypXeaz755BPDYrEY+/fvb1Y9dC2docrKSq1evVqjR4/2HrNarRo9erSWLl1qYmWtT2FhoSQpLi5OkrR69WpVVVXVuffdu3dXWloa976Jpk6dqssvv7zOPZW41770wQcfaPDgwfrZz36mpKQkDRgwQH/961+95zMzM5WXl1fnXsfExGjYsGHc6zM0fPhwLVq0SNu3b5ckrV+/XkuWLNFll10miXvtL425r0uXLlVsbKwGDx7svWb06NGyWq1avnx5s35+UGxR0JIcOnRIbrf7pL2ekpOT9cMPP5hUVevj8Xh0//33a8SIEerdu7ckKS8vT3a7XbGxsXWuTU5OVl5englVBre5c+dqzZo1Wrly5UnnuNe+s3v3bs2aNUvTp0/X//zP/2jlypW69957ZbfbNWXKFO/9rO89hXt9Zh5++GG5XC51795dISEhcrvd+v3vf6/JkydLEvfaTxpzX/Py8pSUlFTnvM1mU1xcXLPvPUEGLdLUqVO1adMmLVmyxOxSWqXs7Gzdd999WrhwocLCwswup1XzeDwaPHiw/vCHP0iSBgwYoE2bNum1117TlClTTK6udXnnnXf05ptv6q233lKvXr20bt063X///WrXrh33uhWja+kMJSQkKCQk5KTZG/n5+UpJSTGpqtZl2rRp+vDDD/XVV1+pQ4cO3uMpKSmqrKxUQUFBneu592du9erVOnDggAYOHCibzSabzabFixfrpZdeks1mU3JyMvfaR9q2bauePXvWOdajRw9lZWVJkvd+8p7SfL/+9a/18MMP6/rrr1efPn30i1/8Qg888IBmzJghiXvtL425rykpKTpw4ECd89XV1Tpy5Eiz7z1B5gzZ7XYNGjRIixYt8h7zeDxatGiRMjIyTKws+BmGoWnTpum9997Tl19+qc6dO9c5P2jQIIWGhta599u2bVNWVhb3/gxdcskl2rhxo9atW+d9DB48WJMnT/Z+zb32jREjRpy0jMD27dvVsWNHSVLnzp2VkpJS5167XC4tX76ce32GSktLZbXW/VgLCQmRx+ORxL32l8bc14yMDBUUFGj16tXea7788kt5PB4NGzaseQU0a6jwWWru3LmGw+Ew5syZY2zZssW44447jNjYWCMvL8/s0oLaXXfdZcTExBhff/21kZub632UlpZ6r7nzzjuNtLQ048svvzRWrVplZGRkGBkZGSZW3XqcOGvJMLjXvrJixQrDZrMZv//9740dO3YYb775phEREWG88cYb3muefvppIzY21nj//feNDRs2GBMmTGBKcBNMmTLFaN++vXf69X//+18jISHB+M1vfuO9hnvdNEVFRcbatWuNtWvXGpKM5557zli7dq2xd+9ewzAad1/Hjh1rDBgwwFi+fLmxZMkSo1u3bky/NtPLL79spKWlGXa73Rg6dKixbNkys0sKepLqfcyePdt7TVlZmXH33Xcbbdq0MSIiIoyrrrrKyM3NNa/oVuTHQYZ77TsLFiwwevfubTgcDqN79+7GX/7ylzrnPR6P8dhjjxnJycmGw+EwLrnkEmPbtm0mVRu8XC6Xcd999xlpaWlGWFiY0aVLF+PRRx81KioqvNdwr5vmq6++qvf9ecqUKYZhNO6+Hj582Jg0aZLhdDqN6Oho4+abbzaKioqaXZvFME5Y8hAAACCIMEYGAAAELYIMAAAIWgQZAAAQtAgyAAAgaBFkAABA0CLIAACAoEWQAQAAQYsgAwAAghZBBoDPdOrUSS+88EKjr//6669lsVhO2pyytTrT+wPg9GxmFwDAPKNGjVL//v199uG6cuVKRUZGNvr64cOHKzc3VzExMT75+QDOPgQZAKdkGIbcbrdsttO/XSQmJp7Ra9vtdqWkpDS1NACgawk4W910001avHixXnzxRVksFlksFu3Zs8fb3fPJJ59o0KBBcjgcWrJkiXbt2qUJEyYoOTlZTqdTQ4YM0RdffFHnNX/cdWKxWPS3v/1NV111lSIiItStWzd98MEH3vM/7lqaM2eOYmNj9dlnn6lHjx5yOp0aO3ascnNzvc+prq7Wvffeq9jYWMXHx+uhhx7SlClTdOWVV57y912yZInOP/98hYeHKzU1Vffee69KSkrq1P7UU09p0qRJioyMVPv27TVz5sw6r5GVlaUJEybI6XQqOjpa1157rfLz8+tcs2DBAg0ZMkRhYWFKSEjQVVddVed8aWmpbrnlFkVFRSktLU1/+ctfTlk3gFMjyABnqRdffFEZGRm6/fbblZubq9zcXKWmpnrPP/zww3r66ae1detW9e3bV8XFxRo3bpwWLVqktWvXauzYsRo/fryysrJO+XOefPJJXXvttdqwYYPGjRunyZMn68iRIw1eX1paqj/96U/617/+pW+++UZZWVl68MEHvef/+Mc/6s0339Ts2bP13XffyeVyaf78+aesYdeuXRo7dqwmTpyoDRs26O2339aSJUs0bdq0Otc9++yz6tevn9auXauHH35Y9913nxYuXChJ8ng8mjBhgo4cOaLFixdr4cKF2r17t6677jrv8z/66CNdddVVGjdunNauXatFixZp6NChdX7G//3f/2nw4MFau3at7r77bt11113atm3bKesHcArN3j8bQNC68MILjfvuu6/Osa+++sqQZMyfP/+0z+/Vq5fx8ssve7/v2LGj8fzzz3u/l2T89re/9X5fXFxsSDI++eSTOj/r6NGjhmEYxuzZsw1Jxs6dO73PmTlzppGcnOz9Pjk52Xj22We931dXVxtpaWnGhAkTGqzz1ltvNe644446x7799lvDarUaZWVl3trHjh1b55rrrrvOuOyyywzDMIzPP//cCAkJMbKysrznN2/ebEgyVqxYYRiGYWRkZBiTJ09usI6OHTsaP//5z73fezweIykpyZg1a1aDzwFwarTIAKjX4MGD63xfXFysBx98UD169FBsbKycTqe2bt162haZvn37er+OjIxUdHS0Dhw40OD1ERERSk9P937ftm1b7/WFhYXKz8+v08oREhKiQYMGnbKG9evXa86cOXI6nd7HmDFj5PF4lJmZ6b0uIyOjzvMyMjK0detWSdLWrVuVmppap9WqZ8+eio2N9V6zbt06XXLJJaes5cT7YbFYlJKScsr7AeDUGOwLoF4/nn304IMPauHChfrTn/6krl27Kjw8XNdcc40qKytP+TqhoaF1vrdYLPJ4PGd0vWEYZ1h9XcXFxfrlL3+pe++996RzaWlpzXrtE4WHh5/2mjO9HwBOjRYZ4Cxmt9vldrsbde13332nm266SVdddZX69OmjlJQU7dmzx78F/khMTIySk5O1cuVK7zG32601a9ac8nkDBw7Uli1b1LVr15Medrvde92yZcvqPG/ZsmXq0aOHJKlHjx7Kzs5Wdna29/yWLVtUUFCgnj17SqppbVm0aFGzf08AjUeLDHAW69Spk5YvX649e/bI6XQqLi6uwWu7deum//73vxo/frwsFosee+wxU1oS7rnnHs2YMUNdu3ZV9+7d9fLLL+vo0aOyWCwNPuehhx7Seeedp2nTpum2225TZGSktmzZooULF+qVV17xXvfdd9/pmWee0ZVXXqmFCxdq3rx5+uijjyRJo0ePVp8+fTR58mS98MILqq6u1t13360LL7zQ2w33xBNP6JJLLlF6erquv/56VVdX6+OPP9ZDDz3k35sCnMVokQHOYg8++KBCQkLUs2dPJSYmnnK8y3PPPac2bdpo+PDhGj9+vMaMGaOBAwcGsNoaDz30kCZNmqQbb7xRGRkZ3vEuYWFhDT6nb9++Wrx4sbZv367zzz9fAwYM0OOPP6527drVue5Xv/qVVq1apQEDBuh///d/9dxzz2nMmDGSarqA3n//fbVp00YXXHCBRo8erS5duujtt9/2Pn/UqFGaN2+ePvjgA/Xv318XX3yxVqxY4Z8bAUCSZDGa2/kMACbyeDzq0aOHrr32Wj311FNNfp1OnTrp/vvv1/333++74gD4HV1LAILK3r179fnnn+vCCy9URUWFXnnlFWVmZuqGG24wuzQAJqBrCUBQsVqtmjNnjoYMGaIRI0Zo48aN+uKLL7yDcgGcXehaAgAAQYsWGQAAELQIMgAAIGgRZAAAQNAiyAAAgKBFkAEAAEGLIAMAAIIWQQYAAAQtggwAAAha/w+A9MziB/uMMQAAAABJRU5ErkJggg==",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 为对比学习负采样准备词频率分布\n",
    "vocab_size = len(dataset.token2id)\n",
    "embed_size = 128\n",
    "distribution = dataset.get_word_distribution()\n",
    "print(distribution)\n",
    "model = SkipGramNCE(vocab_size, embed_size, distribution)\n",
    "\n",
    "from torch.utils.data import DataLoader\n",
    "from torch.optim import SGD, Adam\n",
    "\n",
    "# 定义静态方法collate_batch批量处理数据，转化为PyTorch可以需要的张量类型\n",
    "class DataCollator:\n",
    "    @classmethod\n",
    "    def collate_batch(cls, batch):\n",
    "        batch = np.array(batch)\n",
    "        input_ids = torch.tensor(batch[:, 0], dtype=torch.long)\n",
    "        labels = torch.tensor(batch[:, 1], dtype=torch.long)\n",
    "        return {'input_ids': input_ids, 'labels': labels}\n",
    "\n",
    "# 定义训练参数以及训练循环\n",
    "epochs = 100\n",
    "batch_size = 128\n",
    "learning_rate = 1e-3\n",
    "epoch_loss = []\n",
    "\n",
    "data_collator = DataCollator()\n",
    "dataloader = DataLoader(data, batch_size=batch_size, shuffle=True,\\\n",
    "    collate_fn=data_collator.collate_batch)\n",
    "optimizer = Adam(model.parameters(), lr=learning_rate)\n",
    "model.zero_grad()\n",
    "model.train()\n",
    "\n",
    "# 需要提前安装tqdm\n",
    "from tqdm import trange\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# 训练过程，每步读取数据，送入模型计算损失，并使用PyTorch进行优化\n",
    "with trange(epochs, desc='epoch', ncols=60) as pbar:\n",
    "    for epoch in pbar:\n",
    "        for step, batch in enumerate(dataloader):\n",
    "            loss = model(**batch)\n",
    "            pbar.set_description(f'epoch-{epoch}, loss={loss.item():.4f}')\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "            model.zero_grad()\n",
    "        epoch_loss.append(loss.item())\n",
    "    \n",
    "epoch_loss = np.array(epoch_loss)\n",
    "plt.plot(range(len(epoch_loss)), epoch_loss)\n",
    "plt.xlabel('training epoch')\n",
    "plt.ylabel('loss')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9430e9a",
   "metadata": {},
   "source": [
    "TF-IDF加权\n",
    "\n",
    "定义词频率（term frequency）。注意到不同长度的文章词频率会有较大差距，不利于比较和运算，因此可以对词频率取对数。\n",
    "\n",
    "$$\\text{tf}_{t,d} = \\log (\\text{count}(t,d) + 1)$$\n",
    "\n",
    "其中$\\text{count}(t,d)$表示词$t$在文档$d$中出现的次数，为了避免对0取对数，把所有的计数加1。\n",
    "\n",
    "那么如何区分高频词与低频词呢？TF-IDF引入了另一个重要的评价指标——文档频率（document frequency），即一个词在语料库所包含的多少篇文档中出现。在所有文档里出现的词往往是虚词或是常见实词，而只在少量文档里出现的词往往是具有明确含义的实词并且具有很强的文档区分度。用$\\text{df}_t$来表示在多少篇文档中出现了词$t$。\n",
    "\n",
    "为了压低高频词和提升低频词的影响，TF-IDF使用文档频率的倒数，也就是逆向文档频率（inverse document frequency）来对词频率进行加权。这很好理解，一个词的文档频率越高，其倒数就越小，权重就越小。\n",
    "\n",
    "$$\\text{idf}_t = \\log \\frac{N}{\\text{df}_t}$$\n",
    "\n",
    "其中$N$表示文档总数。为了避免分母为0，通常会将分母改为$\\text{df}_t+1$。\n",
    "\n",
    "基于词频率和逆向文档频率，得到TF-IDF的最终值为：\n",
    "\n",
    "$$w_{t,d} = \\text{tf}_{t,d} \\times \\text{idf}_{t}$$\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f765e353",
   "metadata": {},
   "source": [
    "很多情况下会额外对文档的TF-IDF向量使用L2归一化，使得不同文档的TF-IDF向量具有相同的模长，便于相互比较。\n",
    "下面给出了TF-IDF的代码实现。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9ce8e610",
   "metadata": {},
   "outputs": [],
   "source": [
    "class TFIDF:\n",
    "    def __init__(self, vocab_size, norm='l2', smooth_idf=True,\\\n",
    "                 sublinear_tf=True):\n",
    "        self.vocab_size = vocab_size\n",
    "        self.norm = norm\n",
    "        self.smooth_idf = smooth_idf\n",
    "        self.sublinear_tf = sublinear_tf\n",
    "    \n",
    "    def fit(self, X):\n",
    "        doc_freq = np.zeros(self.vocab_size, dtype=np.float64)\n",
    "        for data in X:\n",
    "            for token_id in set(data):\n",
    "                doc_freq[token_id] += 1\n",
    "        doc_freq += int(self.smooth_idf)\n",
    "        n_samples = len(X) + int(self.smooth_idf)\n",
    "        self.idf = np.log(n_samples / doc_freq) + 1\n",
    "    \n",
    "    def transform(self, X):\n",
    "        assert hasattr(self, 'idf')\n",
    "        term_freq = np.zeros((len(X), self.vocab_size), dtype=np.float64)\n",
    "        for i, data in enumerate(X):\n",
    "            for token in data:\n",
    "                term_freq[i, token] += 1\n",
    "        if self.sublinear_tf:\n",
    "            term_freq = np.log(term_freq + 1)\n",
    "        Y = term_freq * self.idf\n",
    "        if self.norm:\n",
    "            row_norm = (Y**2).sum(axis=1)\n",
    "            row_norm[row_norm == 0] = 1\n",
    "            Y /= np.sqrt(row_norm)[:, None]\n",
    "        return Y\n",
    "    \n",
    "    def fit_transform(self, X):\n",
    "        self.fit(X)\n",
    "        return self.transform(X)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
